This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Loading package and dependencies:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(tidyr)
library(forcats)
library(stringr)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

df1 <- read.csv('./kagglesurvey/kaggle_survey_2022_responses.csv')

Section A. Gender Dynamics

Part 1. Yearly compensation

Data prepping and manipulation

head(df1$Q5)
## [1] "Are you currently a student? (high school, university, or graduate)"
## [2] "No"                                                                 
## [3] "No"                                                                 
## [4] "Yes"                                                                
## [5] "No"                                                                 
## [6] "Yes"
 df1 <- df1[-1,]
 head(df1$Q5)
## [1] "No"  "No"  "Yes" "No"  "Yes" "Yes"
## removing the first row
table(df1$Q5)
## 
##    No   Yes 
## 12036 11961
prop.table(table(df1$Q5))
## 
##        No       Yes 
## 0.5015627 0.4984373
#proportion of non-students to students 
table(df1$Q3)
## 
##                     Man               Nonbinary       Prefer not to say 
##                   18266                      78                     334 
## Prefer to self-describe                   Woman 
##                      33                    5286
as.data.frame(prop.table(table(df1$Q3)))
#ratio of gender
table(df1$Q7)
## < table of extent 0 >
# platform poularity
table(df1$Q12_1)
## 
##        Python 
##   5344  18653
data.frame(prop.table(table(df1$Q12_1)))
df_index <- read.csv('./kagglesurvey/kaggle_survey_2022_responses.csv', header=0)
df_index[1:2,]
df_index <- transpose(df_index[1:2,])
helpful_platforms <- df1 %>% select(contains("Q7_"))
data.frame(x=unlist(helpful_platforms))
helpful_platforms <- helpful_platforms %>% mutate_all(na_if, "")
helpful_platforms <- data.frame(data.frame(x=unlist(helpful_platforms)))
data.frame(sort(table(helpful_platforms), decreasing = TRUE))
data.frame(sort(table(df1[,173]), decreasing=TRUE))

Q3 + Q29

gender_pay <- df1 %>% select(c("Q3","Q29"))
#24,000 rows with NULLS
gender_pay <- gender_pay %>% mutate_all(na_if, "")
#clean NAs
gender_pay %>% group_by(Q3) %>% count(Q29)
#data.frame(table(gender_pay$Q29)) %>% arrange(Freq)
gender_pay %>% count(Q3)
gender_paygap <- gender_pay %>% filter(Q3 == "Man" | Q3 == "Woman")
gender_paygap %>% group_by(Q3) %>% count(Q29)

We are almost there, but the dollar signs are messing with the hierarchies…

gender_paygap %>% count(Q29)
gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% count(Q29) %>% arrange(Q29)
gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .))) %>% count(Q29) %>% arrange(Q29)
gender_paygap <- gender_paygap %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
# stripping out "$" and "commas"
gender_paygap_range <- gender_paygap %>% separate(Q29, c("MINsal", "MAXsal"))
#string split by "-"
millionaires <- gender_paygap %>% filter(Q29 == ">1000000")
#as.numeric(gender_paygap_range$MINsal)
as.numeric(millionaires$Q29)
## Warning: NAs introduced by coercion
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#million+ earners would get NA-coerced by data type transformation
gender_paygap <- gender_paygap %>% filter(Q29 != ">1000000")
#remove millionaires
gender_paygap <- gender_paygap %>% separate(Q29, c("MINsal", "MAXsal"))
#string split
gender_paygap$MINsal <- as.numeric(gender_paygap$MINsal)
gender_paygap$MAXsal <- as.numeric(gender_paygap$MAXsal)
#convert to numeric data
salary_data <- data.frame(gender_paygap)
salary_data
gender_paygap %>% 
  mutate(
    salary_range = paste0 (
  format(MINsal, trim = TRUE),
  "-", 
  MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
  )
gender_paygap <- gender_paygap %>% 
  mutate(
    salary_range = paste0 (
  format(MINsal, trim = TRUE),
  "-", 
  MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
  )

#drop the NAs, then rename gender column
gender_paygap <- gender_paygap %>% drop_na()
gender_paygap <- gender_paygap %>% rename("gender" =  "Q3")
gender_paygap %>% arrange(desc(salary_range))
aggr_gpg <- gender_paygap %>% group_by(gender) %>% count(salary_range)

Mock-up visualizations

ggplot(aggr_gpg, aes(x=salary_range, y=n, fill=gender)) +
  
geom_bar(data=subset(aggr_gpg, gender == "Man"), stat="identity") +

geom_bar(data=subset(aggr_gpg, gender == "Woman"), stat="identity", aes(y=-n)) +
  
coord_flip() + scale_fill_manual(values=c("lightblue", "pink"))

 gender_paygap %>% group_by(gender) %>% summarise(count = n()/nrow(.))
#gender ratio of gender_paygap df
gender_demo <- gender_paygap %>% group_by(gender) %>% summarise(percentage = round(n()/nrow(.),4)*100, lab.pos = cumsum(percentage)-.5*percentage)
ggplot(gender_demo, aes(x=1, y=percentage, fill=gender)) + 
  geom_bar(stat="identity") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.pos, label = paste(percentage,"%", sep = "")), col = "white") + theme_void() +
  scale_fill_manual(values=c("lightblue", "lightpink")) +
  xlim(-1, 2.5)

#ggsave("gender_demo_white.png")
aggr_gpg2 <- gender_paygap %>% group_by(salary_range, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'salary_range'. You can override using the
## `.groups` argument.
ggplot(aggr_gpg2, aes(x=salary_range, y=freq, fill=gender)) +
  
geom_bar(data=subset(aggr_gpg2, gender == "Man"), stat="identity") +

geom_bar(data=subset(aggr_gpg2, gender == "Woman"), stat="identity", aes(y=-freq)) +

geom_hline(yintercept = 0, linetype="dotted", alpha=0.6) +

# Accuracy of y-axis 
scale_y_continuous( labels=c("30%","0%","30%","60%","90%"))  +
  
coord_flip() + scale_fill_manual(values=c("lightblue", "pink")) + theme_minimal() +
  
labs(x = "salary range (USD)", y = "percent share")

#ggsave("gendersalaryratio.png")     
gender_paygap %>% group_by(salary_range, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'salary_range'. You can override using the
## `.groups` argument.
gender_paygap %>% group_by(gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
#gender ratio and relative group percentage

Section A. Gender Dynamics

Part 2. Job role popularity

Data prepping and manipulation

df1 %>% select(Q23) %>% count(Q23)
gender_job <- df1 %>% select(c("Q3", "Q23"))
gender_job <- gender_job %>% mutate_all(na_if, "")
gender_job <- gender_job %>% filter(Q3 == "Man" | Q3 == "Woman")
gender_job <- gender_job %>% filter(Q23 != "Currently not employed" & Q23 != "Other")
gender_job <- gender_job %>% drop_na()
#house cleaning code chunk

gender_job <- gender_job %>% mutate(Q23 = replace(Q23, Q23 == "Data Analyst (Business, Marketing, Financial, Quantitative, etc)", "Data Analyst")) %>% mutate(Q23 = replace(Q23, Q23 == "Manager (Program, Project, Operations, Executive-level, etc)", "Manager")) 
#simplifying strings of some job roles

gender_job <- gender_job %>% rename("gender" = "Q3", "job_title" = "Q23")
gender_job %>% select(job_title) %>% count(job_title) %>% arrange(desc(n))
Group statistics
gender_job %>% group_by(job_title) %>% summarise(n = n()) %>% mutate(freq = n /sum(n)) %>% arrange(desc(n))
gender_job_ratio <- gender_job %>% group_by(job_title, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'job_title'. You can override using the
## `.groups` argument.
gender_job %>% group_by(job_title, gender) %>% summarise(n = n()) %>% mutate(freq = n /sum(n))
## `summarise()` has grouped output by 'job_title'. You can override using the
## `.groups` argument.

Mock-up visualization of gender-job ratio

ggplot(gender_job_ratio, aes(x=job_title, y=freq, fill=gender)) +
  
geom_bar(data=subset(gender_job_ratio, gender == "Man"), stat="identity") +

geom_bar(data=subset(gender_job_ratio, gender == "Woman"), stat="identity", aes(y=-freq)) +

geom_hline(yintercept = 0, linetype="dotted", alpha=0.6) +
# Accuracy of y-axis
scale_y_continuous(breaks=c(-0.25,0,0.25,0.5,0.75),labels=c("25%", "0%", "25%","50%", "75%")) +

coord_flip() + scale_fill_manual(values=c("lightblue", "pink")) + theme_minimal() +
  
labs(x="job title", y="percent share")

#ggsave("genderjobratio.png")

cloud computing exploration

cc_usage <- df1 %>% select(contains(c("Q31")))
cc_enjoyability <- df1 %>% select(Q32)
cc_enjoyability %>% count(Q32) %>% arrange(desc(n))
cc_usage %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))

cc spending

cc_spending <- df1 %>% select(c(Q30, Q23))
df1 %>% select(contains(c("Q44"))) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))

### distribution of salary range

salary_range_dist <- gender_paygap %>% select(salary_range) %>% count(salary_range)
ggplot(salary_range_dist, aes(x=salary_range, y=n)) +

  geom_col(fill="skyblue") +
  
  coord_flip() + theme_minimal()

plot above is skewed, need to consolidate to USA-only

next, tidying up data

clean_sal <- df1 %>% select(c("Q4","Q29")) %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
#strip out $ and commas

clean_sal <- clean_sal  %>% filter(Q29 != ">1000000")
#remove millionaires

clean_sal<- clean_sal %>% separate(Q29, c("MINsal", "MAXsal"))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15861 rows [1, 2,
## 3, 5, 6, 7, 10, 11, 12, 13, 15, 16, 22, 23, 24, 25, 27, 29, 31, 32, ...].
#string split by "-"
clean_sal

bind clean salaries

global_sal_range <- clean_sal
global_sal_range

Section A

Part 3. Salary Range Distribution

usa_sal_range <- global_sal_range %>% filter(Q4 == "United States of America")
usa_sal_range <- usa_sal_range %>% drop_na()
#drop na, then convert columns to numeric
usa_sal_range$MINsal <- as.numeric(usa_sal_range$MINsal)
usa_sal_range$MAXsal <- as.numeric(usa_sal_range$MAXsal)
#after, create factor column corresponding to original salary range columns
usa_sal_range <- usa_sal_range %>% 
  mutate(
    salary_range = paste0 (
  format(MINsal, trim = TRUE),
  "-", 
  MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
  )

after all that, now we can attempt to visualize

usa_sal_range
usa_salary <- usa_sal_range %>% select(salary_range) %>% count(salary_range)

ggplot(usa_salary, aes(x=salary_range, y=n)) +

  geom_col(fill="skyblue") +
  
  coord_flip() + theme_minimal() + labs(x="salary range, annual (USD)", y="count")

usa_salary
#filter for non-USA countries, then drop_na
not_usa_salary <- global_sal_range %>% filter(Q4 != "United States of America")
not_usa_salary <- not_usa_salary %>% drop_na()

#convert to numeric dtype
not_usa_salary$MINsal <- as.numeric(not_usa_salary$MINsal)
not_usa_salary$MAXsal <- as.numeric(not_usa_salary$MAXsal)
#after, create factor column corresponding to original salary range columns
not_usa_salary <- not_usa_salary %>% 
  mutate(
    salary_range = paste0 (
  format(MINsal, trim = TRUE),
  "-", 
  MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
  )

#convert values to "rest of world"
not_usa_salary['Q4'] <- "Rest of World"
#combine the two dataframes by row (not by column)
usa_v_row_sal <- rbind(usa_sal_range, not_usa_salary)
usa_v_row_sal <- usa_v_row_sal %>% rename("country" = "Q4")
usa_v_row_sal

attempt to plot histogram with USA salary range vs Rest of World

ggplot(usa_v_row_sal, aes(x=salary_range, fill=country, color=country)) +
  
  geom_bar(color="white", width=1, linewidth=0) + coord_flip() + theme_minimal() +
  
  scale_fill_manual(values=c("grey90","skyblue")) + 
  
  labs(x="salary range, annual (USD)", y="count") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = c(0.87, 0.25))

ggsave("salary_distribution.png")
## Saving 8 x 6 in image

Section B. Job Dynamics

Part 1.Salary Range Density, by Job title, by Gender

gender_job
gender_paygap
salrange_density <- df1 %>% select("Q3","Q4", "Q23", "Q29")
salrange_density <- salrange_density %>% 
  #filter(Q4 == "United States of America") %>% #filter to US salaries only
  filter(Q3 == "Man" | Q3 == "Woman") %>% #confine to man/woman
  filter(Q23 != "Currently not employed" & Q23 != "Other") %>% #remove unemployed/other
  filter(Q29 != ">$1,000,000") %>% #remove millionaires 
  mutate_all(na_if, "") #fill blank data with NA
salrange_density <- salrange_density %>% filter_at(vars(Q23, Q29), all_vars(!is.na(.))) 
#remove incomplete rows (left with ~7,000 rows)


salrange_density <- salrange_density %>% mutate(Q23 = replace(Q23, Q23 == "Data Analyst (Business, Marketing, Financial, Quantitative, etc)", "Data Analyst")) %>% mutate(Q23 = replace(Q23, Q23 == "Manager (Program, Project, Operations, Executive-level, etc)", "Manager")) 
#simplifying strings of some job roles
#split salary range column(Q29), turn to numeric dtype, then add factor type column
salrange_density <- salrange_density  %>% mutate(across(starts_with("Q29"), ~gsub("\\$", "", .))) %>% mutate(across(starts_with("Q29"), ~gsub("\\,", "", .)))
#strip out $ and commas

salrange_density <- salrange_density %>% separate(Q29, c("MINsal", "MAXsal"))
#string split

salrange_density$MINsal <- as.numeric(salrange_density$MINsal)
salrange_density$MAXsal <- as.numeric(salrange_density$MAXsal)
#as numeric conversion

salrange_density <- salrange_density %>% 
  mutate(
    salary_range = paste0 (
  format(MINsal, trim = TRUE),
  "-", 
  MAXsal
),
salary_range = fct_reorder(salary_range, MINsal)
  )
#re-combine newly separated column and create a corresponding factor column
salrange_density <- salrange_density %>% rename("gender" = "Q3", "job_title" = "Q23")
head(salrange_density)
#col_grid <- rgb(235, 235, 235, 100, maxColorValue = 300)

visualization attempt

ggplot(salrange_density, aes(x=job_title, y=salary_range)) +
  
  geom_jitter(aes(colour=gender), height = 0.3, width = 0.3, alpha=0.8)+
  
  scale_color_manual(values=c("deepskyblue","deeppink")) +
  
   labs(x="job title", y="salary range, annual (USD)") +
  
  theme_minimal() +
  
  theme(axis.text.x = element_text(angle = 45, hjust=1),
        axis.text = element_text(size=rel(2)),
        axis.title = element_text(size=rel(1)))

#ggsave("whiteUSsalary_density.png", width = 7000, height = 4096, units="px")

scatter brain

Part 1. gender ratiod donut chart
df1 %>% select(Q2) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange((value))
age_demo <- df1 %>% select(Q2) %>% gather("key", "value") %>% group_by(value) %>% summarise(n=n()) %>% arrange((value))
ggplot(age_demo, aes(x=n, y=value)) +
  geom_col(fill="gold") +
  scale_x_continuous(expand = c(0, 0)) + 
  scale_x_reverse() +
  scale_y_discrete(position="right") +
  geom_vline(xintercept = 0, linetype="dashed") +
  theme_minimal() +
  labs(x="count", y="") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

#ggsave("age_demographics.png")
age_demo
Part 2. programming donutchart
df1 %>% select(contains("Q12_")) %>% gather("key", "value") %>% mutate_all(na_if, "") %>% drop_na() %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))
pop_language <- df1 %>% select(contains("Q12_")) %>% gather("key", "value") %>% mutate_all(na_if, "") %>% drop_na() %>% group_by(value) %>% summarise(n=n()) %>% arrange(desc(n))

reformat keys to simplify our donut chart

pop_language <- pop_language %>% mutate(value = ifelse(n < 3000, "Other*", value)) %>% group_by(value) %>% summarise(n = sum(n)) %>% arrange(desc(n))
pop_language
pop_language$fraction <- pop_language$n / sum(pop_language$n)

pop_language$ymax <- cumsum(pop_language$fraction)

pop_language$ymin <- c(0, head(pop_language$ymax, n=-1))

# Compute label position
pop_language$labelPosition <- (pop_language$ymax + pop_language$ymin) / 2

pop_language$label <- paste0(pop_language$category, "\n value: ", pop_language$count)
## Warning: Unknown or uninitialised column: `category`.
## Warning: Unknown or uninitialised column: `count`.

making the plot

ggplot(pop_language, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=value)) +
  geom_rect() +
  geom_label(x=3.5, aes(y=labelPosition, label=value)) +
  #geom_label( x=3.5, aes(y=labelPosition, label=label), size=6) +
  scale_fill_brewer(palette= "Set2") +
  coord_polar(theta="y") +
  xlim(c(0, 4)) +
  theme_void() +
  theme(legend.position = "none") +
  labs(caption="*Other includes: C#, MATLAB, Bash, PHP, Go, and Julia")

#ggsave("donutchart_lang.png")
pop_language